
/*********

inefficiencies? / tried & didn't help :

	0. EZW uses contexts to code down to the MSB, then uses only order0
		on the bits below MSB  (this is "if (A&donemask)" for us).
		use binary alphabet? (would allow more proper use of top quartet)
	  tried it; compression *almost* reaches my level, but speed is hurt.
		saved in bak5\ezdct.bitwise

	1. use one bit of 'b' in the context?  -> nope!

	2. top-level (b==0) quartet has no parent; we still use the same code
		structure.  "ezdct2" separates out the first byte for DPCM coding,
		but this actually *hurt*.  Something is still imperfect here.

	3. grey or "blue" coding to make the bitplanes more reflective
			of values?  -> hurt

	4. "blocks then bits" hurt

	5. decode to midoints ? good idea, but never used!

	6.	only using 8 context buckets now; I don't
			think we're doing much more than order0
			coding of bits! (found to be better than 16 or 32)
		in particular, the information of whether the current
			pixels history is On or Off seems to be irrelevant,
				(using only one bit for history hurts 0.002 ,
				 not using it at all hurts 0.06)
			only the parent bit really matters (means the tree structuring *is* good).
		for example, we use "nextmask" on parent now;
			using bitmask or donemask on parent hurts quite a bit (0.02)

*********/

#define DECODE_MIDPOINTS	/** almost irrelevant; even at very low rate,
							 * 	 coding gets down to the lowest bit plane
							***/

#define BITPLANE_ADAPT	// halve contexts after each plane, helps 0.004

#define DPCM_TRANSFORM	// major!

//#define NO_CODING

#define VERBOSE
//#define DEBUG
//#define DO_LOG
//#define MAKE_OUT_RAW
//#define REPORT_PLANES

//#define CODE_SIGNS //hurts on small files, helps a teeny on big ones

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <crbinc/inc.h>
#include <crbinc/fileutil.h>
#include <crbinc/cindcatr.h>
#include <crbinc/timer.h>
#include <crbinc/bbitio.h>
#include <crbinc/arithc.h>
#include <crbinc/scontext.h>
#include <crbinc/context.h>

void dbf(void) { errputs("dbf"); }

#include "dct.h"
#include "quantize.jkfdsh"

/** tree-structures index **/

static int zag_x[] = {
 0,  1,  1,  0,  2,  3,  3,  2,  0,  1,  1,  0,  2,  3,  3,  2, 
 4,  5,  5,  4,  6,  7,  7,  6,  4,  5,  5,  4,  6,  7,  7,  6, 
 0,  1,  1,  0,  2,  3,  3,  2,  0,  1,  1,  0,  2,  3,  3,  2, 
 4,  5,  5,  4,  6,  7,  7,  6,  4,  5,  5,  4,  6,  7,  7,  6 };
static int zag_y[] = {
 0,  0,  1,  1,  0,  0,  1,  1,  2,  2,  3,  3,  2,  2,  3,  3, 
 0,  0,  1,  1,  0,  0,  1,  1,  2,  2,  3,  3,  2,  2,  3,  3, 
 4,  4,  5,  5,  4,  4,  5,  5,  6,  6,  7,  7,  6,  6,  7,  7, 
 4,  4,  5,  5,  4,  4,  5,  5,  6,  6,  7,  7,  6,  6,  7,  7 };

#define ZAG_INDEX(lptr,x,z) (lptr[zag_x[(z)] + (x) + zag_y[(z)]])

#define cleanup(str) if ( 0 ) ; else { errputs(str); exit(10); }

#ifdef DO_LOG
#define LOG(x)	if (0) ; else { x }
#else
#define LOG(x)	if (1) ; else { x }
#endif

#define TRANS_MAX			(0xFFF)
#define TRANS_MAX_BPN		(11)
#define TRANS_MAX_EV		(0xFFE)
#define TRANS_TOPBIT		(0x800)
#define TRANS_TOPBIT_BPN	(10)
#define TRANS_LOWBIT		(0x002)	// since 001 is the sign
#define VAL_MAX 			(TRANS_TOPBIT)

#define data_trans(val) ( ((val) < 0 ) ? ( 1+min(TRANS_MAX_EV,(-val-val))) : (min(TRANS_MAX_EV,(val+val))) )
#define trans_data(val) ( ((val) & 1 ) ? (-((val)>>1)) : (((val)>>1)) )

/** the value "1" is *never* used in trans values.  The bottom bit is only
		on if one of the higher bits is on **/

int log_nothing;
#define RESET_LOG() if ( 0 ) ; else { log_nothing=0; };

#define CODE_CONTEXTS		8	// 8 isn't much worse, and is a bit faster

#define MAKE_CONTEXT(context,parent,a,b,c,d) if ( 0 ) ; else {	\
	context = a+b+c+d; if ( context == 4 ) context = 3;	if ( parent ) context += 4;	\
}

#define CODE_ALPHABET		16	// 4 bits

#define ORDER1_TOTMAX		10000
#define ORDER1_INC			25

#ifdef CODE_SIGNS
#define SIGN_CONTEXTS		3
#define SIGNORDER0_TOTMAX	1000
#define SIGNORDER0_INC		5
#endif

#define PSNR_MAX 	(48.165)	//(10*log10(256^2))

int tune_param = 1;

ubyte **raw = NULL,**rawout = NULL;
uword **trans = NULL,**transout = NULL;
ubyte *comp = NULL;
FILE *rawF = NULL;
struct FAI * FAI = NULL;
struct BitIOInfo * BII = NULL;
#ifdef CODE_SIGNS
scontext **signorder0 = NULL;
#endif
scontext **order1 = NULL;

int width,height,rawsize,complen,totsize,num_planes; /** globals **/

void ExitFunc(void);
void coder_adapt(void);
void encode(int sym,int context);
void encodeSign(bool sym,int prev);
int decode(int context);
bool decodeSign(int prev);

int code_image(ubyte **raw,ubyte **rawout,int width,int height,int num_planes,
	int * qtable,int req_len);

void dct_image(ubyte **raw,uword **trans,int width,int height,int num_planes,
		int * quant_table);
void idct_image(ubyte **rawout,uword **trans,int width,int height,int num_planes,
		int * quant_table);

void TheImageAnalyzer(ubyte **original,ubyte **vs,
					int num_planes,int width,int height,
					float ratio,FILE *sio);

int mse_image(ubyte **original,ubyte **vs,int width,int height,int num_planes);

void init_allocs(void);
void init_coders(void);
void init_coders_write(void);
void init_coders_read(void);
int done_coders(void);
void free_coders(void);

int main(int argc,char *argv[])
{
char *t1,*t2;
int comp_tot_len,req_len,i;
clock_t startclock,stopclock;
int quantizer,bitrate;
int qtable[DCTBLOCK];

	errputs("ezdct v0.5, (c) 1998 by Charles Bloom");

	if ( argc <= 3 ) {
		errputs("ezdct <raw file> <width> <height> [planes] [quantizer] [bitrate*100] [tune param]");
		errputs(" use quantizer == 0 to let the program optimally choose a quantizer ");
		exit(10);
	}

	if ( atexit(ExitFunc) )
	  cleanup("Couldn't install exit trap!");

	if ( ( rawF = fopen(argv[1],"rb") ) == NULL )
		cleanup("open raw failed");

	width = atol(argv[2]);
	height = atol(argv[3]);

	if ( ((width/DCTLINE)*DCTLINE) != width )
		cleanup("size must be divable by 8");
	if ( ((height/DCTLINE)*DCTLINE) != height )
		cleanup("size must be divable by 8");

	if ( argc > 4 )	num_planes = atol(argv[4]);
	else			num_planes = 1;

	if ( argc > 5 )	quantizer = min(max(0,atol(argv[5])),100);
	else			quantizer = 0;

	if ( argc > 6 )	bitrate = min(max(1,atol(argv[6])),1000);
	else			bitrate = 100;

	if ( argc > 7 )	tune_param = atol(argv[7]);
	else			tune_param = 1;

	rawsize = width * height;
	totsize = rawsize*num_planes;
	req_len = (bitrate*totsize)/800;
	complen = req_len + (rawsize/8);

	init_allocs();

	for(i=0;i<num_planes;i++) {
		if ( !FReadOk(rawF,raw[i],rawsize) )
			errputs("fread short! continuing..");
	}
	fclose(rawF); rawF = NULL;

	for(i=0;i<DCTBLOCK;i++) zag_y[i] *= width;

#define qtable_frac qtable_none
	/** we use uniform quantizers ; you may scale them up by a table of
		fractions, but that won't help PSNR ratings **/

	if ( quantizer == 0 ) {
		int step,out,err,out2,err2,out3,err3;

		quantizer = 8; 
			/** small for faster convergence, 
			 * 		but may be too small for extremely low bitrate
			 * 	(maximum is this*2)
			***/

		for(i=0;i<DCTBLOCK;i++) {
			qtable[i] = quantizer*qtable_frac[i];
			if ( qtable[i] < 1 ) qtable[i] = 1;
		}
		out = code_image(raw,rawout,width,height,num_planes,qtable,req_len);
		err = mse_image(raw,rawout,width,height,num_planes);

		for(step = (quantizer>>1);step>0;step>>=1) {
#ifdef VERBOSE
			printf("out = %d, err = %d, quantizer = %d, step = %d\n",out,err,quantizer,step);
#endif

			for(i=0;i<DCTBLOCK;i++) {
				qtable[i] = (quantizer+step)*qtable_frac[i];
				if ( qtable[i] < 1 ) qtable[i] = 1;
			}
			out2 = code_image(raw,rawout,width,height,num_planes,qtable,req_len);
			err2 = mse_image(raw,rawout,width,height,num_planes);

			for(i=0;i<DCTBLOCK;i++) {
				qtable[i] = (quantizer-step)*qtable_frac[i];
				if ( qtable[i] < 1 ) qtable[i] = 1;
			}
			out3 = code_image(raw,rawout,width,height,num_planes,qtable,req_len);
			err3 = mse_image(raw,rawout,width,height,num_planes);

			if ( err2 < err && err2 < err3 ) {
				quantizer += step;
				out = out2; err= err2;
			} else if ( err3 < err ) {
				quantizer -= step;
				out = out3; err= err3;
			} else if ( err2 == err && err3 == err ) {
				if ( out2 < out && out2 < out3 ) {
					quantizer += step;
					out = out2; err= err2;
				} else if ( out3 < out ) {
					quantizer -= step;
					out = out3; err= err3;
				}
			}
		}

#ifdef VERBOSE
		printf("chose quantizer = %d\n",quantizer);
#endif

	} else {

		for(i=0;i<DCTBLOCK;i++) {
			qtable[i] = quantizer*qtable_frac[i];
			if ( qtable[i] < 1 ) qtable[i] = 1;
		}
	}

	RESET_LOG();

	startclock = clock();

	comp_tot_len = code_image(raw,rawout,width,height,num_planes,qtable,req_len);

	stopclock = clock();

	printf("%-13s :",FilePart(argv[1]));
	TheCompressionIndicator(totsize,comp_tot_len,stdout);
#ifdef VERBOSE
	printf(", %6ld byps\n",NumPerSec(totsize*2,stopclock-startclock));
#endif
	TheImageAnalyzer(raw,rawout,num_planes,width,height,((float)totsize/comp_tot_len),stdout);

#ifdef MAKE_OUT_RAW
	if ( ( rawF = fopen("out.raw","wb") ) == NULL )
		cleanup("open rawout failed");
	for(i=0;i<num_planes;i++) {
		if ( !FWriteOk(rawF,rawout[i],rawsize) )
			errputs("fwrote short! continuing..");
	}
	fclose(rawF); rawF = NULL;
#endif //MAKE_OUT_RAW

return 0;
}

int code_image(ubyte **raw,ubyte **rawout,int width,int height,int num_planes,
	int * qtable,int req_len)
{
int coded_len=0;
int temp,pnum;

	dct_image(raw,trans,width,height,num_planes,qtable);

#ifdef NO_CODING /*{*/
	/*#*/ {
	uword *tptr,*toptr;
	int r,rawsize=width*height;

	for(pnum=0;pnum<num_planes;pnum++) {
		tptr = trans[pnum];
		toptr = transout[pnum];
		for(r=rawsize;r--;)
			*toptr++ = *tptr++;
	}
	/*#*/ }

#else /*}{*/

	init_coders_write();

	/* encode */
	/*#*/ {
	uword *transline,*transplane;
	int x,y,b;
	int context,block,parent,parent_sign;
	int bitpn,bitmask,donemask,nextmask,A,B,C,D;
	int trans_top,top_bitpn;
#ifdef REPORT_PLANES
	int last_comp_len=0,cur_comp_len;
#endif

#ifdef DPCM_TRANSFORM	/** this is a lossless transform, not coding **/
	/*#*/ {
	int val,pred; uword *prevline;
	for(pnum=0;pnum<num_planes;pnum++) {
		transplane = trans[pnum];
		for(y=(height-DCTLINE);y>=0;y -= DCTLINE) {
			transline = transplane + y*width;
			prevline  = transline - DCTLINE*width;
			for(x=(width-DCTLINE);x>=0;x -= DCTLINE) {
				val = trans_data(transline[x]);

				if ( x == 0 && y == 0 ) {	pred = 0;
				} else if ( y == 0 ) {		pred = trans_data(transline[x-DCTLINE]);
				} else if ( x == 0 ) {		pred = trans_data(prevline[x]);
				} else { int a,b;
					a = trans_data(transline[x-DCTLINE]);
					b = trans_data(prevline[x]);
					pred = (a + b)>>1;
				}

				val -= pred;

				/** warning : val can now be up to 2* max[dct_output] **/
#ifdef DEBUG
				if ( abs(val) > VAL_MAX ) errputs("capped");
#endif

				transline[x] = data_trans(val);
			}
		}
	}
	/*#*/ }
#endif //DPCM_TRANSFORM


	trans_top=0;
	for(pnum=0;pnum<num_planes;pnum++) { transline = trans[pnum];
		for(x=rawsize;x--;) {
			A = *transline++;
			if ( A > trans_top ) trans_top = A;
		}
	}
	
	for(top_bitpn=0;(1<<(top_bitpn+1))<trans_top;top_bitpn++) ;
	arithEncode(FAI,top_bitpn,top_bitpn+1,TRANS_MAX_BPN);
	trans_top = 1<<top_bitpn;

	nextmask=0; bitpn = top_bitpn;
	for(bitmask = trans_top;bitmask>=TRANS_LOWBIT;bitmask>>=1) {
		donemask = nextmask; bitpn--;
		nextmask = donemask + bitmask;
		for(b=0;b<16;b++) { /** 16 4x4 blocks **/

			for(pnum=0;pnum<num_planes;pnum++) {
				transplane = trans[pnum];
				for(y=0;y<height;y += DCTLINE) {
					transline = transplane + y*width;
					for(x=0;x<width;x += DCTLINE) {

						/** we scan through all spacial locations of
							a certain frequency (so arithcoders adapt)
							but use the previous zigzag as context **/

						// b is the parent, in scan index
						// the children are (b*4)+0,1,2,3

						A = ZAG_INDEX(transline,x, (b<<2)+0 );
						B = ZAG_INDEX(transline,x, (b<<2)+1 );
						C = ZAG_INDEX(transline,x, (b<<2)+2 );
						D = ZAG_INDEX(transline,x, (b<<2)+3 );

						if ( b == 0 ) {
							/* 0th block has no parent, and A is the DC */
							parent = TRANS_MAX;
						} else {
							parent = ZAG_INDEX(transline,x,b);
						}

						MAKE_CONTEXT(context,(parent&nextmask),((A & donemask)?1:0),((B & donemask)?1:0),((C & donemask)?1:0),((D & donemask)?1:0));
						
						block = 0; // 4 bits
						if ( A & bitmask ) block += 1;
						if ( B & bitmask ) block += 2;
						if ( C & bitmask ) block += 4;
						if ( D & bitmask ) block += 8;

						encode(block,context);

						/** dependencies of B on C, etc. are implicitly used in the blocking **/

						 // only code from parent's sign if it has been sent
						if ( parent & nextmask )	parent_sign = parent & 1;
						else						parent_sign = 2;

						if ( (A & bitmask) && !(A & donemask) ) encodeSign(A&1,parent_sign);
						if ( (B & bitmask) && !(B & donemask) ) encodeSign(B&1,parent_sign);
						if ( (C & bitmask) && !(C & donemask) ) encodeSign(C&1,parent_sign);
						if ( (D & bitmask) && !(D & donemask) ) encodeSign(D&1,parent_sign);
					}

				if ( (BitIO_GetPos(BII)) > req_len )
					goto break_encoding;
				}
			}
		}

#ifdef REPORT_PLANES
		cur_comp_len = BitIO_GetPos(BII) - 2;
		printf("bp %-10d :",bitpn);	TheCompressionIndicator(((width*height)>>3),(cur_comp_len - last_comp_len),stdout); puts("");
		last_comp_len = cur_comp_len;
#endif // REPORT_PLANES

#ifdef BITPLANE_ADAPT
		coder_adapt();
#endif

	}

		break_encoding:
			donemask=0; //** dummy for goto

	/*#*/ }

	coded_len = done_coders();

	free_coders();

	for(pnum=0;pnum<num_planes;pnum++) {
		MemClear(transout[pnum],rawsize*sizeof(uword));
	}

	init_coders_read();

	/* decode */
	/*#*/ {
	uword *transline,*transplane;
	int x,y,b;
	int context,block,parent,parent_sign;
	int bitpn,bitmask,donemask,nextmask,A,B,C,D;
	int trans_top,top_bitpn;

	top_bitpn = arithGet(FAI,TRANS_MAX_BPN);
	arithDecode(FAI,top_bitpn,top_bitpn+1,TRANS_MAX_BPN);
	trans_top = 1<<top_bitpn;

	nextmask=0; bitpn = top_bitpn;
	for(bitmask = trans_top;bitmask>=TRANS_LOWBIT;bitmask>>=1) {
		donemask = nextmask; bitpn--;
		nextmask = donemask + bitmask;
		for(b=0;b<16;b++) { /** 16 4x4 blocks **/
			for(pnum=0;pnum<num_planes;pnum++) {
				transplane = transout[pnum];
				for(y=0;y<height;y += DCTLINE) {
					transline = transplane + y*width;
					for(x=0;x<width;x += DCTLINE) {

						A = ZAG_INDEX(transline,x, (b<<2)+0 );
						B = ZAG_INDEX(transline,x, (b<<2)+1 );
						C = ZAG_INDEX(transline,x, (b<<2)+2 );
						D = ZAG_INDEX(transline,x, (b<<2)+3 );

						if ( b == 0 ) {
							parent = TRANS_MAX;
						} else {
							parent = ZAG_INDEX(transline,x,b);
						}

						MAKE_CONTEXT(context,(parent&nextmask),((A & donemask)?1:0),((B & donemask)?1:0),((C & donemask)?1:0),((D & donemask)?1:0));
						
						block = decode(context);

						 // only code from parent's sign if it has been sent
						if ( parent & nextmask )	parent_sign = parent & 1;
						else						parent_sign = 2;

						if ( block & 1 ) { A += bitmask; if ( !(A & donemask) ) A += decodeSign(parent_sign); }
						if ( block & 2 ) { B += bitmask; if ( !(B & donemask) ) B += decodeSign(parent_sign); }
						if ( block & 4 ) { C += bitmask; if ( !(C & donemask) ) C += decodeSign(parent_sign); }
						if ( block & 8 ) { D += bitmask; if ( !(D & donemask) ) D += decodeSign(parent_sign); }

						ZAG_INDEX(transline,x, (b<<2)+0 ) = A;
						ZAG_INDEX(transline,x, (b<<2)+1 ) = B;
						ZAG_INDEX(transline,x, (b<<2)+2 ) = C;
						ZAG_INDEX(transline,x, (b<<2)+3 ) = D;
					}

				if ( (BitIO_GetPos(BII)) > req_len )
					goto break_decoding; /** don't waste our time any more **/
				}
			}
		}
#ifdef BITPLANE_ADAPT
		coder_adapt();
#endif
	}

	break_decoding:
		/** push values up to midpoints **/

#ifdef DECODE_MIDPOINTS
		bitmask >>= 1; bitmask--;
		if ( bitmask >= TRANS_LOWBIT ) {
			for(pnum=0;pnum<num_planes;pnum++) { transline = trans[pnum];
				for(x=rawsize;x--;) {
					if ( *transline > 0 ) *transline += bitmask;
					transline++;
				}
			}
		}
#endif // DECODE_MIDPOINTS

#ifdef DPCM_TRANSFORM /** undo it **/
	/*#*/ {
	int val,pred; uword *prevline;
	for(pnum=0;pnum<num_planes;pnum++) {
		transplane = transout[pnum];
		for(y=0;y<height;y += DCTLINE) {
			transline = transplane + y*width;
			prevline  = transline - DCTLINE*width;
			for(x=0;x<width;x += DCTLINE) {
				val = trans_data(transline[x]);

				if ( x == 0 && y == 0 ) {	pred = 0;
				} else if ( y == 0 ) {		pred = trans_data(transline[x-DCTLINE]);
				} else if ( x == 0 ) {		pred = trans_data(prevline[x]);
				} else { int a,b;
					a = trans_data(transline[x-DCTLINE]);
					b = trans_data(prevline[x]);
					pred = (a + b)>>1;
				}

				val += pred;

				transline[x] = data_trans(val);
			}
		}
	}
	/*#*/ }
#endif //DPCM_TRANSFORM

	/*#*/ }

	free_coders();

#endif // NO_CODING /*}*/

	idct_image(rawout,transout,width,height,num_planes,qtable);

return coded_len;
}

void coder_adapt(void)
{
int context;
	for(context=0;context<CODE_CONTEXTS;context++) {
		scontextHalve(order1[context]);
		scontextHalve(order1[context]);
	}
}

void encode(int sym,int context)
{
scontextEncode(order1[context],sym);
}

void encodeSign(bool sign,int parent)
{
#ifdef CODE_SIGNS
	scontextEncode(signorder0[parent],sign);
#else
	arithBit(FAI,sign);
#endif
}

int decode(int context)
{
return scontextDecode(order1[context]);
}
bool decodeSign(int prev)
{
#ifdef CODE_SIGNS
	return scontextDecode(signorder0[prev]);
#else
	return arithGetBit(FAI);
#endif
}

void ExitFunc(void)
{
int i;

free_coders();

smartfree(comp);

if ( raw ) {
	for(i=0;i<num_planes;i++) {
		smartfree(raw[i]);
	}
	free(raw);
}
if ( rawout ) {
	for(i=0;i<num_planes;i++) {
		smartfree(rawout[i]);
	}
	free(rawout);
}
if ( trans ) {
	for(i=0;i<num_planes;i++) {
		smartfree(trans[i]);
	}
	free(trans);
}
if ( transout ) {
	for(i=0;i<num_planes;i++) {
		smartfree(transout[i]);
	}
	free(transout);
}


if ( rawF ) fclose(rawF);

}

int mse_image(ubyte **original,ubyte **vs,int width,int height,int num_planes)
{
int diffs[256];
int diff,i,totsq,pnum;
int rawsize;

	rawsize = width*height;

	MemClear(diffs,256*sizeof(int));

	for(pnum=0;pnum<num_planes;pnum++) {
		ubyte *rptr,*vptr;
		rptr = original[pnum]; vptr = vs[pnum];
		for(i=rawsize;i--;) {
			diff = *rptr++ - *vptr++;
			if ( diff < 0 ) diff = -diff;
			diffs[diff] ++;
		}
	}

	totsq = 0;
	for(i=1;i<256;i++) {
		if ( diffs[i] > 0 ) {
			totsq += i*i*diffs[i];
		}
	}

return totsq;
}

void TheImageAnalyzer(ubyte **original,ubyte **im2,
					int num_planes,int width,int height,
					float ratio,FILE *sio)
{
int diffs[256];
int diff,i,tot,totsq,max,pnum,j;
int rawsize,totsize;
float mse,me,rmse,mse_percep;

rawsize = width*height;
totsize = width*height*num_planes;

	MemClear(diffs,256*sizeof(int));

	for(pnum=0;pnum<num_planes;pnum++) {
		ubyte *rptr,*vptr;
		rptr = original[pnum]; vptr = im2[pnum];
		for(i=rawsize;i--;) {
			diff = *rptr++ - *vptr++;
			if ( diff < 0 ) diff = -diff;
			diffs[diff] ++;
		}
	}

	tot = totsq = max = 0;
	for(i=1;i<256;i++) {
		if ( diffs[i] > 0 ) {
			max = i;
			tot += i * diffs[i];
			totsq += i*i * diffs[i];
		}
	}

	me = (float)tot/totsize;
	mse = (float)totsq/totsize;
	rmse = sqrt(mse);

	for(pnum=0;pnum<num_planes;pnum++) {
		int x,y,av1,av2;
		ulong totds;
		ubyte *line1,*pline1,*nline1;
		ubyte *line2,*pline2,*nline2;
		line1 = original[pnum]; nline1 = line1+width;
		line2 = im2[pnum]; nline2 = line2+width;
		totds = 0;
		for(y=1;y<(width-1);y++) {
			pline1 = line1; line1 = nline1; nline1 += width;
			pline2 = line2; line2 = nline2; nline2 += width;
			for(x=1;x<(width-1);x++) {
				av1 = line1[x] + line1[x-1] + line1[x+1] + pline1[x] + nline1[x];
				av2 = line2[x] + line2[x-1] + line2[x+1] + pline2[x] + nline2[x];
				diff = (av1 - av2); diff *= diff;
				if ( abs( line1[x-1] - line1[x+1] ) < 5 &&
 					 abs( pline1[x] - nline1[x] ) < 5 ) totds += diff + diff;
				else totds += diff;
			}
		}

		mse_percep = (float)totds/(25.0*totsize);
	}

#ifdef VERBOSE
	fprintf(sio,"error: av= %.2f,max= %d,mse= %.3f,rmse= %.2f,psnr= %.2f,perc= %.2f\n",me,max,mse,rmse,(PSNR_MAX - 10*log10(mse)),mse_percep);
	fprintf(sio,"performance: MSE = %f , RMSE = %f , percep = %f\n",(ratio/mse),(ratio/rmse),(ratio/mse_percep));
		/** use MSE in high error regime, RMS in low error **/
#else
	fprintf(sio,"RMSE=%f,PSNR=%f\n",rmse,(PSNR_MAX - 10*log10(mse)));;
#endif

}

void dct_image(ubyte **raw,uword **trans,int width,int height,int num_planes,
		int * quant_table)
{
ubyte *plane,*line,*lptr;
uword *transline,*transplane,*tptr;
RAWDATA rawblock[DCTBLOCK],*rptr;
DCTDATA dctblock[DCTBLOCK],*bptr;
int x,y,pnum,i;

	dct_init(quant_table);

	for(pnum=0;pnum<num_planes;pnum++) {
		plane = raw[pnum];
		transplane = trans[pnum];
		for(y=0;y<height;y += DCTLINE) {
			line = plane + y*width;
			transline = transplane + y*width;
			for(x=0;x<width;x += DCTLINE) {

				rptr = rawblock;
				for(i=0;i<DCTLINE;i++) {
					lptr = line + x + i*width;
					unroll_dctline(*rptr++ = *lptr++);
				}
	
				dct(rawblock,dctblock);

				bptr = dctblock;
				for(i=0;i<DCTLINE;i++) {
					tptr = transline + x + i*width;
					unroll_dctline(*tptr++ = data_trans(*bptr); bptr++);
				}
			}
		}
	}


}


void idct_image(ubyte **rawout,uword **trans,int width,int height,int num_planes,
		int * quant_table)
{
ubyte *plane,*line,*lptr;
uword *transline,*transplane,*tptr;
RAWDATA rawblock[DCTBLOCK],*rptr;
DCTDATA dctblock[DCTBLOCK],*bptr;
int x,y,pnum,i;

	idct_init(quant_table);

	for(pnum=0;pnum<num_planes;pnum++) {
		plane = rawout[pnum];
		transplane = trans[pnum];
		for(y=0;y<height;y += DCTLINE) {
			line = plane + y*width;
			transline = transplane + y*width;
			for(x=0;x<width;x += DCTLINE) {

				bptr = dctblock;
				for(i=0;i<DCTLINE;i++) {
					tptr = transline + x + i*width;
					unroll_dctline(*bptr++ = trans_data(*tptr); tptr++;);
				}

				idct(dctblock,rawblock);

				rptr = rawblock;
				for(i=0;i<DCTLINE;i++) {
					lptr = line + x + i*width;
					unroll_dctline(*lptr++ = *rptr++);
				}
	
			}
		}
	}

}

void init_allocs(void)
{
int i;
	if ( (raw = malloc(sizeofpointer*num_planes)) == NULL )
		cleanup("malloc failed");
	if ( (rawout = malloc(sizeofpointer*num_planes)) == NULL )
		cleanup("malloc failed");
	if ( (trans = malloc(sizeofpointer*num_planes)) == NULL )
		cleanup("malloc failed");
	if ( (transout = malloc(sizeofpointer*num_planes)) == NULL )
		cleanup("malloc failed");

	for(i=0;i<num_planes;i++) {
		if ( (raw[i] = malloc(rawsize)) == NULL )
			cleanup("malloc failed");
		if ( (rawout[i] = malloc(rawsize)) == NULL )
			cleanup("malloc failed");
		if ( (trans[i] = newarray(uword,rawsize)) == NULL )
			cleanup("malloc failed");
		if ( (transout[i] = newarray(uword,rawsize)) == NULL )
			cleanup("malloc failed");
	}

	if ( (comp = malloc(complen)) == NULL )
		cleanup("malloc failed");
}

void init_coders_write(void)
{
	MemClear(comp,complen);

	init_coders();

	FastArithEncodeCInit(FAI);
}

void init_coders_read(void)
{
	init_coders();

	BitIO_InitRead(BII);

	FastArithDecodeCInit(FAI);
}

void init_coders(void)
{
int i;
	if ( (BII = BitIO_Init(comp)) == NULL )
		cleanup("BitIOInit failed!");
	if ( (FAI = FastArithInit(BII)) == NULL )
		cleanup("FastArithInit failed!");

	if ( (order1 = AllocMem(CODE_CONTEXTS*sizeofpointer,MEMF_CLEAR)) == NULL )
		cleanup("Order1_Init failed!");

	for(i=0;i<CODE_CONTEXTS;i++) {
		if ( (order1[i] = scontextCreate(FAI,CODE_ALPHABET,0,
				ORDER1_TOTMAX,ORDER1_INC,true)) == NULL )
			cleanup("context creation failed!");
	}

#ifdef CODE_SIGNS
	if ( (signorder0 = AllocMem(SIGN_CONTEXTS*sizeofpointer,MEMF_CLEAR)) == NULL )
		cleanup("Order1_Init failed!");
	for(i=0;i<SIGN_CONTEXTS;i++) {
		if ( (signorder0[i] = scontextCreate(FAI,2,
				0,SIGNORDER0_TOTMAX,SIGNORDER0_INC, true)) == NULL )
			cleanup("Order0_Init failed!");
	}
#endif
}

int done_coders(void)
{
int complen;
	FastArithEncodeCDone(FAI);
	complen = 0;
	complen += BitIO_FlushWrite(BII) - 2;
return complen;
}

void free_coders(void)
{
int i;

if ( order1 ) {
	for(i=0;i<CODE_CONTEXTS;i++)
		if ( order1[i] ) scontextFree(order1[i]);
	free(order1);
	order1 = NULL;
}
#ifdef CODE_SIGNS
if ( signorder0 ) {
	for(i=0;i<SIGN_CONTEXTS;i++)
		if ( signorder0[i] ) scontextFree(signorder0[i]);
	free(signorder0);
	signorder0 = NULL;
}
#endif

if ( FAI ) FastArithCleanUp(FAI); FAI = NULL;
if ( BII ) BitIO_CleanUp(BII); BII = NULL;

}
